Yi & Bourzikas specializes in talent management solutions for Fortune 1000 companies, focusing on building and developing strategies for retaining employees. We specialize in workforce planning, employee training programs, identifying high-potential employees, and reducing/preventing voluntary employee turnover (attrition). As part of this engagement, our data science team will predict attrition for your organization.
# Load the training and validation sets; the literal string "NULL" becomes NA.
employee <- read.csv("CaseStudy2-data.csv", na.strings = "NULL")
employeeValidation <- read.csv("CaseStudy2Validation.csv", na.strings = "NULL")
result <- rbind(employee, employeeValidation) # raw train + validation combined
# Create 1/0 from Categorical Variables
emp_train <- fastDummies::dummy_cols(employee) # Create Dummy Variables
emp_test <- fastDummies::dummy_cols(employeeValidation) # Create Dummy Variables
# BUG FIX: the original bound emp_test to itself (rbind(emp_test, emp_test)),
# silently dropping every training row from the "combined" set.
emp_result <- rbind(emp_train, emp_test) # combine train and test data sets
# Creating Variables
# Define Data Columns to Make it Easier
# NOTE(review): these are positional column indices into emp_result AFTER
# fastDummies::dummy_cols(); they depend on the exact column order of the raw
# CSVs and on the dummy-generation order — re-verify against names(emp_result)
# whenever the input data changes.
cols.Base <- c(2:36)     # the original (pre-dummy) columns
cols.CatAttr <- c(38:39) # presumably the Attrition dummy columns — TODO confirm
cols.CatAll <- c(40:68)  # all generated dummy columns
# Predictor columns excluding the JobRole dummies; the printed names below
# show what this selection resolves to.
col.NoJobRole <- c(1,2,5,7,8,10,12,14,15,18,20,21,22,25:36,40:42,52:53,63:68)
names(emp_result[,c(col.NoJobRole)])
## [1] "ID" "Age"
## [3] "DailyRate" "DistanceFromHome"
## [5] "Education" "EmployeeCount"
## [7] "EnvironmentSatisfaction" "HourlyRate"
## [9] "JobInvolvement" "JobSatisfaction"
## [11] "MonthlyIncome" "MonthlyRate"
## [13] "NumCompaniesWorked" "PercentSalaryHike"
## [15] "PerformanceRating" "RelationshipSatisfaction"
## [17] "StandardHours" "StockOptionLevel"
## [19] "TotalWorkingYears" "TrainingTimesLastYear"
## [21] "WorkLifeBalance" "YearsAtCompany"
## [23] "YearsInCurrentRole" "YearsSinceLastPromotion"
## [25] "YearsWithCurrManager" "BusinessTravel_Travel_Rarely"
## [27] "BusinessTravel_Travel_Frequently" "BusinessTravel_Non-Travel"
## [29] "Gender_Female" "Gender_Male"
## [31] "MaritalStatus_Married" "MaritalStatus_Divorced"
## [33] "MaritalStatus_Single" "Over18_Y"
## [35] "OverTime_No" "OverTime_Yes"
# Removed 17 From Data Set
# NOTE(review): the sequence below runs ...,22,66,24,... — 66 appears where 23
# would be expected; the same substitution appears in cols.CatGLM. Confirm
# whether 66 (a dummy column) is an intentional replacement or a typo for 23.
cols.RemoveJobRoleCat <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,18,19,20,21,22,66,24,25,26,27,28,29,30,31,32,33,34,35,36)
# All Job Detailed Roles
cols.JobRoles <- c(54:62)
# Every column except the Attrition field(s)
cols.AllButAttr <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,24,25,26,27,28,29,30,31,32,33,34,35,36,40,41,42,43,44,45,46,47,48,49,50,51,52,53,63,64,65,66,67,68)
# This is all the Categorical Fields (used by the GLM models below)
cols.CatGLM <- c(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,66,24,25,26,27,28,29,30,31,32,33,34,35,36)
# Columns fed to the KNN models (includes the JobRole dummies)
cols.CatKNN <- c(1,2,3,5,7,8,10,11,12,14,15,16,18,20,21,22,25,26,27,28,29,30,31,32,33,34,35,36,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68)
# Numeric-only selection used for the correlation heat map
cols.NumericAll <- c(1,2,5,7,8,10,11,12,14,15,16,18,20,21,22,25,26,27,28,29,30,31,32,33,34,35,36,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68)
cols.Attrition <- 34 # NOTE(review): position of an Attrition column — confirm
# Hand-picked "key" fields used by the reduced heat map / pair plot
cols.KeyFieldsBaseModel <- c(40:42,7,12,63:65,22,67:68,27,30,31:36)
We received the data files from your employee database and have outlined some key highlights. The following charts are part of our exploratory data analysis and will give your organization an idea of how the different features in the data set apply. The pair plots show all the variables based on whether your employees have left the organization.
# Basic EDA
# EDA - Exploratory Not for Report
# Pair plots of columns 2..40 in windows of ~5 variables, colored by attrition.
# BUG FIX: the rows being plotted come from emp_result, so the color vector
# must come from the same frame; the original used emp_train$Attrition, whose
# length differs from nrow(emp_result) and would be silently recycled,
# mis-coloring most points.
for (block in list(2:5, 6:10, 11:15, 16:20, 21:25, 26:30, 31:35, 36:40)) {
  pairs(emp_result[, block], col = emp_result$Attrition)
}
Because of the data that we were able to analyze in the Pair Plots above, we developed 2 Heat Maps and a Correlations and Distribution Matrix to take a deeper dive into the data set.
# Heat Map for All Fields
# Pairwise Pearson correlations (rounded to 2 dp) across the numeric/dummy columns.
employeeHeatMap <- round(cor(emp_result[,c(cols.NumericAll)]),2)
## Warning in cor(emp_result[, c(cols.NumericAll)]): the standard deviation is
## zero
# NOTE(review): the warning above means at least one selected column is
# constant (likely EmployeeCount/StandardHours); its correlations come out NA.
# Reshape the correlation matrix to long form for ggplot tiling.
# Assumes melt() is the reshape-package version (columns X1/X2); reshape2 would
# name them Var1/Var2 — TODO confirm which package is attached.
melted_employeeHeatMap <- melt(employeeHeatMap)
ggplot(data = melted_employeeHeatMap, aes(x=X1, y=X2, fill=value)) +
theme(axis.text.x = element_blank(),axis.ticks.x=element_blank(),axis.title.x=element_blank(),axis.text.y = element_text(size = 7))+geom_tile()
#ggsave("images/employeeHeatMap.png",plot = last_plot(), type = png())
# Heat Map for Key Sign Fields
# Same correlation heat map as above, restricted to the hand-picked key fields.
employeeHeatMapSig <- round(cor(emp_result[,c(cols.KeyFieldsBaseModel)]),2)
# Assumes the reshape-package melt() (columns X1/X2) — TODO confirm.
melted_employeeHeatMapSig <- melt(employeeHeatMapSig)
ggplot(data = melted_employeeHeatMapSig, aes(x=X1, y=X2, fill=value)) +
theme(axis.text.x = element_blank(),axis.ticks.x=element_blank(),axis.title.x=element_blank(),axis.text.y = element_text(size = 7))+
geom_tile()
#ggsave("images/employeeHeatMapSig.png",plot = last_plot(), type = png())
# EDA For Key Sign Fields on Attrition for Overall Model
# GGally::ggpairs scatter/density matrix of the key fields, colored by attrition.
# NOTE(review): the plot object is assigned but never printed here, so in a
# knitted document this chunk produces no figure unless printed elsewhere.
ggkeySignPairs <- ggpairs(
mapping = ggplot2::aes(color = emp_result$Attrition),
emp_result[,c(cols.KeyFieldsBaseModel)],
diag=list(continuous="densityDiag", discrete="barDiag"),
axisLabels="show") + theme_minimal()
#ggsave("ggkeySignPairs.png",plot = last_plot(), type = png())
Showing the significance, p-values, for each variable. This is the base information to understand the key contributors that affect attrition. This data will be used through our data science work so we can utilize the following variables for the models we build.
The following table outlines the top significant factors contributing to attrition:
#TrainDataSet
# Logistic regression of Attrition on the selected predictors, fit separately
# on the training set, the test set, and the combined set; only the p-value
# column (Pr(>|z|)) of each coefficient table is kept for the report.
glm_modeltrain <- glm(emp_train$Attrition ~ ., emp_train[, c(cols.CatGLM)], family = binomial) # glm train
model_Train <- data.frame(coef(summary(glm_modeltrain))[, 4]) # p-values from glm train
names(model_Train) <- "Logistic Regression on Training Set" # title (typo "Regressio" fixed)
#TestDataSet
glm_modeltest <- glm(emp_test$Attrition ~ ., emp_test[, c(cols.CatGLM)], family = binomial) # glm test
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model_Test <- data.frame(coef(summary(glm_modeltest))[, 4]) # p-values from glm test
names(model_Test) <- "Logistic Regression Test on Test" # title
#AllData
glm_modelAll <- glm(emp_result$Attrition ~ ., emp_result[, c(cols.CatGLM)], family = binomial) # glm on combined train + test set
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
model_All <- data.frame(coef(summary(glm_modelAll))[, 4]) # p-values from the combined data set
names(model_All) <- "Logistic Regression on All Data" # title (typo "Regressio" fixed)
# Table consolidated
GLM_dataset <- cbind(model_Train, model_Test, model_All) # consolidated train, test and all data set
# Creating kable table for GLM dataset
GLM_dataset %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F) %>% scroll_box(width = "600px", height = "450px")
| Logistic Regression on Training Set | Logistic Regression Test on Test | Logistic Regression on All Data | |
|---|---|---|---|
| (Intercept) | 0.9856239 | 0.9989900 | 0.9985716 |
| ID | 0.7413226 | 0.5564256 | 0.4055318 |
| Age | 0.2081840 | 0.0156765 | 0.0006326 |
| BusinessTravelTravel_Frequently | 0.0000674 | 0.0010245 | 0.0000034 |
| BusinessTravelTravel_Rarely | 0.0257045 | 0.0164520 | 0.0006930 |
| DailyRate | 0.1137743 | 0.0637612 | 0.0087484 |
| DepartmentResearch & Development | 0.9840834 | 0.9999149 | 0.9998797 |
| DepartmentSales | 0.9855939 | 0.9982575 | 0.9975358 |
| DistanceFromHome | 0.0003241 | 0.0022409 | 0.0000154 |
| Education | 0.8266214 | 0.1306817 | 0.0325598 |
| EducationFieldLife Sciences | 0.9750329 | 0.3202993 | 0.1598708 |
| EducationFieldMarketing | 0.8437994 | 0.5521055 | 0.4004024 |
| EducationFieldMedical | 0.9098803 | 0.2537434 | 0.1065171 |
| EducationFieldOther | 0.6960025 | 0.9285191 | 0.8990460 |
| EducationFieldTechnical Degree | 0.4683605 | 0.4508936 | 0.2863276 |
| EmployeeNumber | 0.9027209 | 0.4249982 | 0.2592223 |
| EnvironmentSatisfaction | 0.0000007 | 0.0800222 | 0.0132989 |
| GenderMale | 0.0660375 | 0.1909220 | 0.0643729 |
| HourlyRate | 0.2337414 | 0.3492105 | 0.1855442 |
| JobInvolvement | 0.0000060 | 0.8371013 | 0.7712302 |
| JobLevel | 0.3421974 | 0.0316539 | 0.0023752 |
| JobRoleHuman Resources | 0.9829785 | 0.9981111 | 0.9973286 |
| JobRoleLaboratory Technician | 0.0127075 | 0.9921081 | 0.9888393 |
| JobRoleManager | 0.8653399 | 0.9989564 | 0.9985241 |
| JobRoleManufacturing Director | 0.8918567 | 0.9931527 | 0.9903166 |
| JobRoleResearch Director | 0.0349614 | 0.9909768 | 0.9872395 |
| JobRoleResearch Scientist | 0.8565130 | 0.9923366 | 0.9891625 |
| JobRoleSales Executive | 0.1372067 | 0.9999902 | 0.9999861 |
| JobRoleSales Representative | 0.0299926 | 0.9995330 | 0.9993395 |
| JobSatisfaction | 0.0000192 | 0.0068582 | 0.0001315 |
| MaritalStatusMarried | 0.0601860 | 0.4269320 | 0.2612143 |
| MaritalStatusSingle | 0.0008135 | 0.1077738 | 0.0229388 |
| MonthlyIncome | 0.2085435 | 0.0074394 | 0.0001536 |
| MonthlyRate | 0.4484720 | 0.0070837 | 0.0001399 |
| NumCompaniesWorked | 0.0000051 | 0.0129331 | 0.0004395 |
| OverTimeYes | 0.0000000 | 0.0000101 | 0.0000000 |
| PercentSalaryHike | 0.6585720 | 0.5792289 | 0.4329243 |
| PerformanceRating | 0.7420886 | 0.0477017 | 0.0051076 |
| RelationshipSatisfaction | 0.0363799 | 0.0009600 | 0.0000030 |
| StockOptionLevel | 0.2540503 | 0.7718614 | 0.6817778 |
| TotalWorkingYears | 0.0141019 | 0.6265806 | 0.4914031 |
| TrainingTimesLastYear | 0.0335913 | 0.2194417 | 0.0824447 |
| WorkLifeBalance | 0.0006481 | 0.7803786 | 0.6933455 |
| YearsAtCompany | 0.0107927 | 0.1789009 | 0.0573137 |
| YearsInCurrentRole | 0.0034318 | 0.0492443 | 0.0054190 |
| YearsSinceLastPromotion | 0.0000925 | 0.0064181 | 0.0001159 |
| YearsWithCurrManager | 0.0131109 | 0.0292065 | 0.0020427 |
Learning about any job-role-specific trends that may exist in the data set is key because it tells us which variables are significant by job. This data can be used to identify which effects will contribute to the attrition rate by job. Any value that is < 0.05 is significant.
From the tables below, each job description will show the key attributes for attrition:
# Glm for Job role - Human Resources
# One logistic regression per job-role dummy: which predictors (the JobRole
# dummies themselves excluded via col.NoJobRole) distinguish employees in that
# role. Each model keeps only the coefficient p-values for the report table.
glm_model_JobRoleHR <- glm(emp_result$`JobRole_Human Resources`~.,emp_result[,c(col.NoJobRole)], family = binomial) # glm
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
JobRoleHR = data.frame(coef(summary(glm_model_JobRoleHR))[,4]) # pulling only p-values from the glm
names(JobRoleHR) = "Human Resources" # creating title
# Glm for Job role - Manufacturing Director
glm_model_JobRoleManufactoring <- glm(emp_result$`JobRole_Manufacturing Director`~.,emp_result[,c(col.NoJobRole)], family = binomial)# glm
JobRoleManufactoring = data.frame(coef(summary(glm_model_JobRoleManufactoring))[,4])# pulling only p-values from the glm
names(JobRoleManufactoring) = "Manufacturing Director" # creating title
# Glm for Job role - Research Scientist
glm_model_JobRoleResearch <- glm(emp_result$`JobRole_Research Scientist`~.,emp_result[,c(col.NoJobRole)], family = binomial)# glm
JobRoleResearch = data.frame(coef(summary(glm_model_JobRoleResearch))[,4])# pulling only p-values from the glm
names(JobRoleResearch) = "Research Scientist" # creating title
# Glm for Job role - Lab Tech
glm_model_JobRoleLab <- glm(emp_result$`JobRole_Laboratory Technician`~.,emp_result[,c(col.NoJobRole)], family = binomial)# glm
JobRoleLab = data.frame(coef(summary(glm_model_JobRoleLab))[,4])# pulling only p-values from the glm
names(JobRoleLab) = "Laboratory Technician" # creating title
# Glm for Job role - Research Director
# NOTE(review): this fit did not converge (warnings below) — its p-values in
# the table (all ~0.99) are not meaningful.
glm_model_JobRoleResearchDirector <- glm(emp_result$`JobRole_Research Director`~.,emp_result[,c(col.NoJobRole)], family = binomial)# glm
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
JobRoleRD = data.frame(coef(summary(glm_model_JobRoleResearchDirector))[,4])# pulling only p-values from the glm
names(JobRoleRD) = "Research Director" # creating title
# Glm for Job role - Sales Exec
glm_model_JobRoleSalesExec <- glm(emp_result$`JobRole_Sales Executive`~.,emp_result[,c(col.NoJobRole)], family = binomial)# glm
JobRoleSE = data.frame(coef(summary(glm_model_JobRoleSalesExec))[,4])# pulling only p-values from the glm
names(JobRoleSE) = "Sales Executive" # creating title
# Glm for Job role - Sales Person
glm_model_JobRoleSalesPerson <- glm(emp_result$`JobRole_Sales Representative`~.,emp_result[,c(col.NoJobRole)], family = binomial)# glm
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
JobRoleSP = data.frame(coef(summary(glm_model_JobRoleSalesPerson))[,4])# pulling only p-values from the glm
names(JobRoleSP) = "Sales Representative" # creating title
# Glm for Job role - Manager
# NOTE(review): this fit also did not converge — see warnings below.
glm_model_JobRoleManager <- glm(emp_result$JobRole_Manager~.,emp_result[,c(col.NoJobRole)], family = binomial)# glm
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
JobRoleManager = data.frame(coef(summary(glm_model_JobRoleManager))[,4])# pulling only p-values from the glm
names(JobRoleManager) = "Manager" # creating title
# Glm for Job role - HealthCare
glm_model_JobRoleHealth <- glm(emp_result$`JobRole_Healthcare Representative`~.,emp_result[,c(col.NoJobRole)], family = binomial)# glm
JobRoleHealthR = data.frame(coef(summary(glm_model_JobRoleHealth))[,4])# pulling only p-values from the glm
names(JobRoleHealthR) = "Healthcare Representative" # creating title
# Gender by Job Role
glm_model_Gender <- glm(emp_train$Gender~.,emp_train[,c(cols.JobRoles)], family = binomial)# glm
Gender_Model = data.frame(coef(summary(glm_model_Gender))[,4])# pulling only p-values from the glm
names(Gender_Model) = "Gender" # creating title
# Marital Status by Role
# NOTE(review): MaritalStatus has three levels; with family = binomial, glm
# models "first level vs. the rest" — confirm this dichotomization is intended.
glm_model_Marital <- glm(emp_train$MaritalStatus~.,emp_train[,c(cols.JobRoles)], family = binomial)# glm
Marital_Model = data.frame(coef(summary(glm_model_Marital))[,4])# pulling only p-values from the glm
names(Marital_Model) = "Marital Status" # creating title
# Consolidated all the job role glm
# NOTE(review): JobRoleSP (Sales Representative) and JobRoleHealthR (Healthcare
# Representative) are computed above but omitted here — confirm intent.
Table.glm <-cbind(JobRoleHR, JobRoleManufactoring,JobRoleResearch,JobRoleLab,JobRoleRD,JobRoleSE,JobRoleManager)
# kable output for the consolidated glm
Table.glm %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F) %>% scroll_box(width = "800px", height = "450px")
| Human Resources | Manufacturing Director | Research Scientist | Laboratory Technician | Research Director | Sales Executive | Manager | |
|---|---|---|---|---|---|---|---|
| (Intercept) | 0.9898902 | 0.8562672 | 0.0105992 | 0.7634204 | 0.9978978 | 0.0476180 | 0.9978271 |
| ID | 0.7800491 | 0.0850500 | 0.0037740 | 0.3127304 | 0.9980976 | 0.1538642 | 0.9998129 |
| Age | 0.5623287 | 0.0011058 | 0.1447794 | 0.0967646 | 0.9980432 | 0.0913818 | 0.9981136 |
| DailyRate | 0.0566140 | 0.1108373 | 0.4081995 | 0.2371188 | 0.9968457 | 0.5493793 | 0.9956510 |
| DistanceFromHome | 0.8972205 | 0.9493385 | 0.2216075 | 0.5860499 | 0.9994825 | 0.6118137 | 0.9998753 |
| Education | 0.0073892 | 0.2810289 | 0.0009405 | 0.8614410 | 0.9899819 | 0.6872301 | 0.9971418 |
| EnvironmentSatisfaction | 0.2027087 | 0.3123112 | 0.0141533 | 0.0043965 | 0.9915696 | 0.0302721 | 0.9994402 |
| HourlyRate | 0.3740712 | 0.9479455 | 0.0121477 | 0.3686677 | 0.9947282 | 0.5606605 | 0.9988406 |
| JobInvolvement | 0.7070367 | 0.1618613 | 0.3183384 | 0.7401576 | 0.9965561 | 0.1738210 | 0.9993601 |
| JobSatisfaction | 0.1048157 | 0.0776753 | 0.3254296 | 0.5401725 | 0.9989589 | 0.6174958 | 0.9992501 |
| MonthlyIncome | 0.0224822 | 0.2626945 | 0.0000000 | 0.0000000 | 0.9845461 | 0.0011392 | 0.9954927 |
| MonthlyRate | 0.5427228 | 0.4894283 | 0.1768713 | 0.7061777 | 0.9925435 | 0.0915706 | 0.9986764 |
| NumCompaniesWorked | 0.0203893 | 0.7453919 | 0.1440189 | 0.1403865 | 0.9938553 | 0.2820852 | 0.9975385 |
| PercentSalaryHike | 0.0143982 | 0.0682091 | 0.3278690 | 0.0632794 | 0.9957684 | 0.9146964 | 0.9990989 |
| PerformanceRating | 0.0648621 | 0.7289152 | 0.9049755 | 0.0154966 | 0.9956879 | 0.4222443 | 0.9979472 |
| RelationshipSatisfaction | 0.0000869 | 0.5796037 | 0.8065068 | 0.0524516 | 0.9885338 | 0.0024727 | 0.9991710 |
| StockOptionLevel | 0.1236563 | 0.8992230 | 0.2760117 | 0.7811183 | 0.9981700 | 0.8869781 | 0.9998946 |
| TotalWorkingYears | 0.6211428 | 0.8135194 | 0.3365269 | 0.5438628 | 0.9984232 | 0.0000230 | 0.9996878 |
| TrainingTimesLastYear | 0.0808694 | 0.6323765 | 0.1939522 | 0.1884160 | 0.9937384 | 0.1884307 | 0.9993732 |
| WorkLifeBalance | 0.0645333 | 0.5504761 | 0.3950650 | 0.6004307 | 0.9986976 | 0.3066064 | 0.9982452 |
| YearsAtCompany | 0.5449955 | 0.3139114 | 0.0648560 | 0.7427766 | 0.9946482 | 0.9130788 | 0.9986172 |
| YearsInCurrentRole | 0.0901148 | 0.1030673 | 0.0498326 | 0.7287665 | 0.9974437 | 0.8042547 | 0.9981619 |
| YearsSinceLastPromotion | 0.2196358 | 0.3914967 | 0.9653856 | 0.3582245 | 0.9985325 | 0.4563314 | 0.9998616 |
| YearsWithCurrManager | 0.9564791 | 0.6897727 | 0.0235609 | 0.3843670 | 0.9956744 | 0.0044464 | 0.9987819 |
| BusinessTravel_Travel_Rarely | 0.4988297 | 0.1195384 | 0.0095628 | 0.0568347 | 0.9991386 | 0.1092909 | 0.9987905 |
| BusinessTravel_Travel_Frequently | 0.0291885 | 0.4796807 | 0.3334086 | 0.9489262 | 0.9974423 | 0.4305227 | 0.9997462 |
| Gender_Female | 0.2969850 | 0.1212829 | 0.0529954 | 0.0003685 | 0.9934974 | 0.1012798 | 0.9970246 |
| MaritalStatus_Married | 0.9873648 | 0.2143762 | 0.1368206 | 0.3113332 | 0.9988253 | 0.6662205 | 0.9996602 |
| MaritalStatus_Divorced | 0.9879489 | 0.8211764 | 0.4981004 | 0.7711347 | 0.9991421 | 0.3085217 | 0.9988060 |
| OverTime_No | 0.0034983 | 0.3745554 | 0.5164776 | 0.3319150 | 0.9925320 | 0.1718642 | 0.9996636 |
# kable output for the Gender by Job Role glm
# Render the Gender-model p-values as a scrollable striped table.
Gender_Model %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F) %>% scroll_box(width = "500px", height = "450px")
| Gender | |
|---|---|
| (Intercept) | 0.0922882 |
| JobRole_Human Resources | 0.3306074 |
| JobRole_Manufacturing Director | 0.3235260 |
| JobRole_Research Scientist | 0.4690106 |
| JobRole_Laboratory Technician | 0.1812029 |
| JobRole_Research Director | 0.3882230 |
| JobRole_Sales Executive | 0.5296589 |
| JobRole_Sales Representative | 0.9953475 |
| JobRole_Manager | 0.5411048 |
# kable output for the Marital Status by Role glm
# Render the MaritalStatus-model p-values as a scrollable striped table.
Marital_Model %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F) %>% scroll_box(width = "500px", height = "450px")
| Marital Status | |
|---|---|
| (Intercept) | 0.0000014 |
| JobRole_Human Resources | 0.9587634 |
| JobRole_Manufacturing Director | 0.9664449 |
| JobRole_Research Scientist | 0.5597292 |
| JobRole_Laboratory Technician | 0.5325639 |
| JobRole_Research Director | 0.4853543 |
| JobRole_Sales Executive | 0.7642466 |
| JobRole_Sales Representative | 0.0541812 |
| JobRole_Manager | 0.5038035 |
Running the full KNN model using the training and test data set, the full KNN model came out to have a high accuracy rate of 84%; from there we decided to run the KNN model by job role. The glm showed us that each job has different variables of significance, so the KNN by job reflects the different variables that pertain to that specific role.
As a key note, KNN works better with larger data sets than splitting them into job positions.
# KNN — fit a k-nearest-neighbours classifier (caret) with default
# bootstrap resampling over the tuning grid of k.
set.seed(123) # make the resampling reproducible
#knn.train = train(Attrition~., data=emp_train[,c(cols.CatKNN)], method="knn", trControl=control, tuneGrid=grid1)
knn.train <- train(Attrition ~ ., data = emp_train[, c(cols.CatKNN)], method = "knn")
print(knn.train)
## k-Nearest Neighbors
##
## 1170 samples
## 56 predictor
## 2 classes: 'No', 'Yes'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 1170, 1170, 1170, 1170, 1170, 1170, ...
## Resampling results across tuning parameters:
##
## k Accuracy Kappa
## 5 0.7782905 0.008056618
## 7 0.7956919 0.005752565
## 9 0.8099105 0.010861496
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was k = 9.
#Set K=18 sq of 1480
# NOTE(review): sqrt(1170) ~ 34 and sqrt(1480) ~ 38, so k = 18 does not match
# the "square root of n" rule claimed above — confirm the intended k.
# class::knn — column 3 of the cols.CatKNN subset is the Attrition label:
# dropped from the predictors ([,-3]) and supplied as the class vector ([,3]).
knn.test = knn(emp_train[,c(cols.CatKNN)][,-3], emp_test[,c(cols.CatKNN)][,-3], emp_train[,c(cols.CatKNN)][,3], k=18)
# Confusion matrix of KNN predictions vs. the true test labels.
knnPrediction <-confusionMatrix(table(knn.test, emp_test$Attrition))
knnPrediction
## Confusion Matrix and Statistics
##
##
## knn.test No Yes
## No 251 48
## Yes 0 1
##
## Accuracy : 0.84
## 95% CI : (0.7935, 0.8796)
## No Information Rate : 0.8367
## P-Value [Acc > NIR] : 0.4758
##
## Kappa : 0.0337
## Mcnemar's Test P-Value : 1.17e-11
##
## Sensitivity : 1.00000
## Specificity : 0.02041
## Pos Pred Value : 0.83946
## Neg Pred Value : 1.00000
## Prevalence : 0.83667
## Detection Rate : 0.83667
## Detection Prevalence : 0.99667
## Balanced Accuracy : 0.51020
##
## 'Positive' Class : No
##
# 2x2 fourfold display of the KNN confusion matrix above
fourfoldplot(knnPrediction$table)
Running the Weighted KNN model using the training and test data set, the Weighted KNN model came out to have a higher accuracy rate of 84.4% than the KNN model's 84%. Additionally, the plot below shows the optimal K, which is 30.
# K Weighted
set.seed(123)
# train.kknn performs leave-one-out cross-validation over k = 1..kmax
# (distance = 2 is Euclidean / Minkowski order 2).
kknn.train = train.kknn(Attrition~., data=emp_train[,c(cols.CatKNN)], kmax=30, distance = 2)
# Predict Attrition on the test predictors (column 3 = the label, dropped).
prediction <- predict(kknn.train, emp_test[,c(cols.CatKNN)][,-3])
# Show Confusion Matrix against the true test labels (column 3 of the subset).
kWeightedPrediction <- confusionMatrix(table(prediction, emp_test[,c(cols.CatKNN)][,3]))
kWeightedPrediction
## Confusion Matrix and Statistics
##
##
## prediction No Yes
## No 251 47
## Yes 0 2
##
## Accuracy : 0.8433
## 95% CI : (0.7972, 0.8826)
## No Information Rate : 0.8367
## P-Value [Acc > NIR] : 0.4139
##
## Kappa : 0.0665
## Mcnemar's Test P-Value : 1.949e-11
##
## Sensitivity : 1.00000
## Specificity : 0.04082
## Pos Pred Value : 0.84228
## Neg Pred Value : 1.00000
## Prevalence : 0.83667
## Detection Rate : 0.83667
## Detection Prevalence : 0.99333
## Balanced Accuracy : 0.52041
##
## 'Positive' Class : No
##
# 2x2 fourfold display of the weighted-KNN confusion matrix
fourfoldplot(kWeightedPrediction$table)
# Plot Prediction for number of K
graphics.off() # close existing devices so the new margins take effect
par(mar=c(5,5,5,5))
plot(kknn.train) # misclassification vs. k; shows the optimal k chosen
The following model applies logistic regression to the training and test sets using all the fields. For logistic regression to work, the data was formatted as numeric and set up with a prediction threshold by which we converted each probability into a class. With this model, we predicted at an 87% rate.
# Logistic Regression (No Lasso) - Winning Model
# Base Model: Attrition ~ all selected predictors on the training set.
# NOTE(review): the summary reports 3 NA coefficients (singularities) — the
# constant columns EmployeeCount, Over18_Y, StandardHours carry no information.
glm_model <- glm(emp_train$Attrition~.,data = emp_train[,c(cols.CatGLM)], family = binomial)
summary(glm_model)
##
## Call:
## glm(formula = emp_train$Attrition ~ ., family = binomial, data = emp_train[,
## c(cols.CatGLM)])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5831 -0.4893 -0.2395 -0.0824 3.2351
##
## Coefficients: (3 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.254e+01 6.957e+02 -0.018 0.985624
## ID 9.763e-05 2.958e-04 0.330 0.741323
## Age -1.917e-02 1.523e-02 -1.259 0.208184
## BusinessTravelTravel_Frequently 1.858e+00 4.663e-01 3.985 6.74e-05
## BusinessTravelTravel_Rarely 9.574e-01 4.292e-01 2.231 0.025704
## DailyRate -3.949e-04 2.497e-04 -1.581 0.113774
## DepartmentResearch & Development 1.388e+01 6.957e+02 0.020 0.984083
## DepartmentSales 1.256e+01 6.957e+02 0.018 0.985594
## DistanceFromHome 4.361e-02 1.213e-02 3.595 0.000324
## Education 2.211e-02 1.009e-01 0.219 0.826621
## EducationFieldLife Sciences -3.611e-02 1.154e+00 -0.031 0.975033
## EducationFieldMarketing 2.371e-01 1.203e+00 0.197 0.843799
## EducationFieldMedical -1.310e-01 1.157e+00 -0.113 0.909880
## EducationFieldOther -4.750e-01 1.216e+00 -0.391 0.696003
## EducationFieldTechnical Degree 8.451e-01 1.165e+00 0.725 0.468361
## EmployeeCount NA NA NA NA
## EmployeeNumber -2.082e-05 1.704e-04 -0.122 0.902721
## EnvironmentSatisfaction -4.711e-01 9.506e-02 -4.956 7.19e-07
## GenderMale 3.846e-01 2.092e-01 1.838 0.066038
## HourlyRate 6.008e-03 5.046e-03 1.191 0.233741
## JobInvolvement -6.303e-01 1.392e-01 -4.526 6.00e-06
## JobLevel -3.281e-01 3.454e-01 -0.950 0.342197
## JobRoleHuman Resources 1.484e+01 6.957e+02 0.021 0.982979
## JobRoleLaboratory Technician 1.284e+00 5.151e-01 2.492 0.012707
## JobRoleManager 1.561e-01 9.205e-01 0.170 0.865340
## JobRoleManufacturing Director -7.803e-02 5.739e-01 -0.136 0.891857
## JobRoleResearch Director -2.824e+00 1.339e+00 -2.109 0.034961
## JobRoleResearch Scientist 9.597e-02 5.307e-01 0.181 0.856513
## JobRoleSales Executive 2.046e+00 1.376e+00 1.486 0.137207
## JobRoleSales Representative 3.097e+00 1.427e+00 2.170 0.029993
## JobSatisfaction -3.917e-01 9.164e-02 -4.274 1.92e-05
## MaritalStatusMarried 5.912e-01 3.146e-01 1.879 0.060186
## MaritalStatusSingle 1.334e+00 3.984e-01 3.348 0.000814
## MonthlyIncome 1.129e-04 8.975e-05 1.258 0.208543
## MonthlyRate -1.078e-05 1.423e-05 -0.758 0.448472
## NumCompaniesWorked 2.006e-01 4.399e-02 4.560 5.12e-06
## Over18_Y NA NA NA NA
## OverTimeYes 1.959e+00 2.216e-01 8.841 < 2e-16
## PercentSalaryHike -1.991e-02 4.505e-02 -0.442 0.658572
## PerformanceRating 1.509e-01 4.586e-01 0.329 0.742089
## RelationshipSatisfaction -1.950e-01 9.319e-02 -2.093 0.036380
## StandardHours NA NA NA NA
## StockOptionLevel -1.985e-01 1.740e-01 -1.141 0.254050
## TotalWorkingYears -8.257e-02 3.364e-02 -2.455 0.014102
## TrainingTimesLastYear -1.768e-01 8.321e-02 -2.125 0.033591
## WorkLifeBalance -4.776e-01 1.400e-01 -3.411 0.000648
## YearsAtCompany 1.105e-01 4.333e-02 2.549 0.010793
## YearsInCurrentRole -1.479e-01 5.056e-02 -2.926 0.003432
## YearsSinceLastPromotion 1.862e-01 4.763e-02 3.909 9.25e-05
## YearsWithCurrManager -1.237e-01 4.987e-02 -2.481 0.013111
##
## (Intercept)
## ID
## Age
## BusinessTravelTravel_Frequently ***
## BusinessTravelTravel_Rarely *
## DailyRate
## DepartmentResearch & Development
## DepartmentSales
## DistanceFromHome ***
## Education
## EducationFieldLife Sciences
## EducationFieldMarketing
## EducationFieldMedical
## EducationFieldOther
## EducationFieldTechnical Degree
## EmployeeCount
## EmployeeNumber
## EnvironmentSatisfaction ***
## GenderMale .
## HourlyRate
## JobInvolvement ***
## JobLevel
## JobRoleHuman Resources
## JobRoleLaboratory Technician *
## JobRoleManager
## JobRoleManufacturing Director
## JobRoleResearch Director *
## JobRoleResearch Scientist
## JobRoleSales Executive
## JobRoleSales Representative *
## JobSatisfaction ***
## MaritalStatusMarried .
## MaritalStatusSingle ***
## MonthlyIncome
## MonthlyRate
## NumCompaniesWorked ***
## Over18_Y
## OverTimeYes ***
## PercentSalaryHike
## PerformanceRating
## RelationshipSatisfaction *
## StandardHours
## StockOptionLevel
## TotalWorkingYears *
## TrainingTimesLastYear *
## WorkLifeBalance ***
## YearsAtCompany *
## YearsInCurrentRole **
## YearsSinceLastPromotion ***
## YearsWithCurrManager *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1031.48 on 1169 degrees of freedom
## Residual deviance: 674.24 on 1123 degrees of freedom
## AIC: 768.24
##
## Number of Fisher Scoring iterations: 15
#predict probabilities on testset
# type="response" gives probabilities on the 0-1 scale.
# NOTE(review): the original comment claimed type="class" gives classes, but
# predict.glm only supports "link", "response", "terms" — classes are derived
# manually below via the 0.5 threshold.
glm_prob <- predict.glm(glm_model,emp_test[,-3],type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# NOTE(review): the rank-deficiency warning follows from the singular (NA)
# coefficients in glm_model — see the constant columns flagged in its summary.
#which classes do these probabilities refer to? What are 1 and 0?
contrasts(emp_test$Attrition)
## Yes
## No 0
## Yes 1
#make predictions
# First create a vector of "No" defaults (we know 0 refers to "No" now).
# NOTE(review): dfTrain is a misnomer — it holds TEST-set class predictions.
dfTrain <- rep("No",nrow(emp_test))
dfTrain[glm_prob>.5] <- "Yes" # threshold probabilities at 0.5
#confusion matrix
LogRegOnly <-confusionMatrix(table(pred=dfTrain,true=emp_test$Attrition))
LogRegOnly
## Confusion Matrix and Statistics
##
## true
## pred No Yes
## No 244 32
## Yes 7 17
##
## Accuracy : 0.87
## 95% CI : (0.8266, 0.9059)
## No Information Rate : 0.8367
## P-Value [Acc > NIR] : 0.0658297
##
## Kappa : 0.4015
## Mcnemar's Test P-Value : 0.0001215
##
## Sensitivity : 0.9721
## Specificity : 0.3469
## Pos Pred Value : 0.8841
## Neg Pred Value : 0.7083
## Prevalence : 0.8367
## Detection Rate : 0.8133
## Detection Prevalence : 0.9200
## Balanced Accuracy : 0.6595
##
## 'Positive' Class : No
##
# 2x2 fourfold display of the logistic-regression confusion matrix
fourfoldplot(LogRegOnly$table)
# Pair each test-set ID with its predicted Attrition class.
dfPreds = data.frame(emp_test$ID,dfTrain)
colnames(dfPreds) = c("ID","Prediction")
dfPreds
## ID Prediction
## 1 1171 No
## 2 1172 No
## 3 1173 No
## 4 1174 No
## 5 1175 No
## 6 1176 No
## 7 1177 No
## 8 1178 No
## 9 1179 No
## 10 1180 No
## 11 1181 No
## 12 1182 No
## 13 1183 No
## 14 1184 No
## 15 1185 No
## 16 1186 No
## 17 1187 No
## 18 1188 No
## 19 1189 Yes
## 20 1190 No
## 21 1191 No
## 22 1192 No
## 23 1193 No
## 24 1194 No
## 25 1195 No
## 26 1196 Yes
## 27 1197 No
## 28 1198 No
## 29 1199 Yes
## 30 1200 No
## 31 1201 No
## 32 1202 Yes
## 33 1203 No
## 34 1204 No
## 35 1205 No
## 36 1206 No
## 37 1207 No
## 38 1208 No
## 39 1209 No
## 40 1210 No
## 41 1211 No
## 42 1212 No
## 43 1213 No
## 44 1214 No
## 45 1215 No
## 46 1216 No
## 47 1217 Yes
## 48 1218 No
## 49 1219 No
## 50 1220 No
## 51 1221 No
## 52 1222 No
## 53 1223 No
## 54 1224 No
## 55 1225 No
## 56 1226 No
## 57 1227 No
## 58 1228 No
## 59 1229 No
## 60 1230 No
## 61 1231 No
## 62 1232 No
## 63 1233 No
## 64 1234 No
## 65 1235 No
## 66 1236 No
## 67 1237 No
## 68 1238 No
## 69 1239 No
## 70 1240 No
## 71 1241 No
## 72 1242 No
## 73 1243 No
## 74 1244 No
## 75 1245 No
## 76 1246 No
## 77 1247 No
## 78 1248 Yes
## 79 1249 No
## 80 1250 No
## 81 1251 No
## 82 1252 No
## 83 1253 Yes
## 84 1254 No
## 85 1255 No
## 86 1256 No
## 87 1257 No
## 88 1258 No
## 89 1259 No
## 90 1260 No
## 91 1261 No
## 92 1262 No
## 93 1263 No
## 94 1264 No
## 95 1265 No
## 96 1266 No
## 97 1267 No
## 98 1268 Yes
## 99 1269 No
## 100 1270 No
## 101 1271 No
## 102 1272 No
## 103 1273 No
## 104 1274 Yes
## 105 1275 No
## 106 1276 No
## 107 1277 No
## 108 1278 No
## 109 1279 No
## 110 1280 No
## 111 1281 No
## 112 1282 No
## 113 1283 No
## 114 1284 No
## 115 1285 No
## 116 1286 No
## 117 1287 No
## 118 1288 Yes
## 119 1289 No
## 120 1290 No
## 121 1291 No
## 122 1292 Yes
## 123 1293 No
## 124 1294 No
## 125 1295 No
## 126 1296 No
## 127 1297 No
## 128 1298 No
## 129 1299 No
## 130 1300 No
## 131 1301 No
## 132 1302 No
## 133 1303 No
## 134 1304 Yes
## 135 1305 No
## 136 1306 No
## 137 1307 No
## 138 1308 No
## 139 1309 No
## 140 1310 No
## 141 1311 No
## 142 1312 No
## 143 1313 No
## 144 1314 No
## 145 1315 No
## 146 1316 No
## 147 1317 No
## 148 1318 No
## 149 1319 No
## 150 1320 No
## 151 1321 No
## 152 1322 No
## 153 1323 No
## 154 1324 No
## 155 1325 No
## 156 1326 No
## 157 1327 No
## 158 1328 No
## 159 1329 No
## 160 1330 No
## 161 1331 No
## 162 1332 Yes
## 163 1333 No
## 164 1334 Yes
## 165 1335 No
## 166 1336 No
## 167 1337 No
## 168 1338 No
## 169 1339 No
## 170 1340 Yes
## 171 1341 No
## 172 1342 No
## 173 1343 No
## 174 1344 No
## 175 1345 No
## 176 1346 No
## 177 1347 No
## 178 1348 Yes
## 179 1349 Yes
## 180 1350 No
## 181 1351 No
## 182 1352 No
## 183 1353 No
## 184 1354 No
## 185 1355 No
## 186 1356 No
## 187 1357 No
## 188 1358 No
## 189 1359 No
## 190 1360 No
## 191 1361 Yes
## 192 1362 No
## 193 1363 No
## 194 1364 No
## 195 1365 Yes
## 196 1366 No
## 197 1367 No
## 198 1368 No
## 199 1369 No
## 200 1370 No
## 201 1371 No
## 202 1372 No
## 203 1373 No
## 204 1374 No
## 205 1375 No
## 206 1376 No
## 207 1377 No
## 208 1378 No
## 209 1379 No
## 210 1380 No
## 211 1381 No
## 212 1382 No
## 213 1383 No
## 214 1384 No
## 215 1385 No
## 216 1386 No
## 217 1387 No
## 218 1388 No
## 219 1389 No
## 220 1390 No
## 221 1391 No
## 222 1392 No
## 223 1393 No
## 224 1394 No
## 225 1395 No
## 226 1396 No
## 227 1397 No
## 228 1398 No
## 229 1399 No
## 230 1400 Yes
## 231 1401 No
## 232 1402 No
## 233 1403 No
## 234 1404 No
## 235 1405 No
## 236 1406 No
## 237 1407 No
## 238 1408 No
## 239 1409 No
## 240 1410 No
## 241 1411 Yes
## 242 1412 Yes
## 243 1413 No
## 244 1414 No
## 245 1415 No
## 246 1416 No
## 247 1417 No
## 248 1418 No
## 249 1419 No
## 250 1420 No
## 251 1421 No
## 252 1422 No
## 253 1423 No
## 254 1424 No
## 255 1425 No
## 256 1426 No
## 257 1427 No
## 258 1428 No
## 259 1429 No
## 260 1430 No
## 261 1431 No
## 262 1432 No
## 263 1433 No
## 264 1434 No
## 265 1435 No
## 266 1436 Yes
## 267 1437 No
## 268 1438 No
## 269 1439 No
## 270 1440 No
## 271 1441 No
## 272 1442 No
## 273 1443 No
## 274 1444 No
## 275 1445 No
## 276 1446 No
## 277 1447 No
## 278 1448 No
## 279 1449 No
## 280 1450 No
## 281 1451 No
## 282 1452 No
## 283 1453 No
## 284 1454 No
## 285 1455 No
## 286 1456 No
## 287 1457 No
## 288 1458 No
## 289 1459 No
## 290 1460 Yes
## 291 1461 No
## 292 1462 No
## 293 1463 No
## 294 1464 No
## 295 1465 No
## 296 1466 No
## 297 1467 No
## 298 1468 No
## 299 1469 No
## 300 1470 No
# Persist the test-set predictions (ID, Prediction) for submission.
write.csv(dfPreds,file = "LabelPrediction.csv",row.names = FALSE)
Adding to the Logistic Regression model above, Logistic Regression using Lasso was used for automated feature selection. As part of this model, we discovered that only 6 variables predict at 86% vs. the general logistic regression's 87% that utilizes all fields in the data set. Because Logistic Regression using Lasso is more efficient, our recommendation is to run the logistic regression model using LASSO: it is more efficient at only 1% less prediction accuracy.
Because Lasso was used for feature selection, it is essential to validate the numbers that come out of the LASSO model. Through the automated LASSO feature selection, it was discovered via down-selects that there is a better model.
#Begin Logistic Regression with Lasso
#convert training data to matrix format
set.seed(123)
# Build the dummy-expanded design matrix for glmnet from the selected
# predictor columns.
# NOTE(review): cols.CatGLM is defined elsewhere in the file — confirm it
# selects the intended predictor set.
x <- model.matrix(emp_train$Attrition~.,emp_train[,c(cols.CatGLM)])
# Encode the response as 1 = attrition ("Yes"), 0 = no attrition.
y <- ifelse(emp_train$Attrition=="Yes",1,0)
# Run Base Model
# Cross-validated LASSO (alpha = 1) logistic regression.
# NOTE(review): type.measure = "mse" is an unusual choice for
# family = "binomial"; "deviance", "class", or "auc" are conventional —
# confirm this was intentional before re-running.
glm.lasso.new <- cv.glmnet(x,y,alpha=1,family="binomial",type.measure = "mse")
plot(glm.lasso.new)
#min value of lambda
lambda_min <- glm.lasso.new$lambda.min
#best value of lambda (largest lambda within 1 SE of the minimum CV error)
lambda_1se <- glm.lasso.new$lambda.1se
#regression coefficients at the 1-SE lambda (sparse matrix)
glm.lasso.new.coef <- coef(glm.lasso.new,s=lambda_1se)
# List the non-zero coefficients LASSO retained; @i holds 0-based row
# indices into the sparse coefficient matrix, hence the +1 for R's
# 1-based Dimnames lookup.
data.frame(name = glm.lasso.new.coef@Dimnames[[1]][glm.lasso.new.coef@i + 1], coefficient = glm.lasso.new.coef@x)
## name coefficient
## 1 (Intercept) 1.5071026023
## 2 Age -0.0104129673
## 3 BusinessTravelTravel_Frequently 0.5679751924
## 4 DailyRate -0.0001563157
## 5 DepartmentResearch & Development -0.3175946691
## 6 DepartmentSales 0.0101192638
## 7 DistanceFromHome 0.0181403082
## 8 EducationFieldMarketing 0.0664777807
## 9 EducationFieldOther -0.0255696530
## 10 EducationFieldTechnical Degree 0.4419704188
## 11 EnvironmentSatisfaction -0.3050329917
## 12 GenderMale 0.1548217727
## 13 JobInvolvement -0.4297487944
## 14 JobRoleLaboratory Technician 0.6614081416
## 15 JobRoleManufacturing Director -0.0282707270
## 16 JobRoleResearch Director -0.5589679652
## 17 JobRoleSales Representative 0.8125101982
## 18 JobSatisfaction -0.2201139436
## 19 MaritalStatusSingle 0.6026889077
## 20 NumCompaniesWorked 0.0890748733
## 21 OverTimeYes 1.3690621543
## 22 RelationshipSatisfaction -0.0564621601
## 23 StockOptionLevel -0.1346097156
## 24 TotalWorkingYears -0.0324705552
## 25 TrainingTimesLastYear -0.0647294624
## 26 WorkLifeBalance -0.2452690840
## 27 YearsInCurrentRole -0.0427828135
## 28 YearsSinceLastPromotion 0.0753910193
## 29 YearsWithCurrManager -0.0263903970
# Get column indices of the predictors LASSO kept (non-zero coefficients).
# NOTE(review): @i is 0-based and indexes rows of the sparse coefficient
# matrix (i.e. columns of the dummy-expanded model matrix x), not columns
# of emp_train directly — confirm this mapping selects the intended
# emp_train columns.
cols.lasso.coef <- glm.lasso.new.coef@i
cols.lasso.coef <- cols.lasso.coef[-1] # Remove the intercept
train.reduce = emp_train[,cols.lasso.coef]
# Drop column 20 of the reduced set.
# TODO(review): document which variable this magic index removes.
train.reduce = train.reduce[,-20]
#Assess Model
# Ordinary (unpenalized) logistic regression on the LASSO-selected
# predictors, so standard errors and p-values can be inspected.
glm.assess <- glm(Attrition~.,data = train.reduce, family = "binomial")
summary(glm.assess)
##
## Call:
## glm(formula = Attrition ~ ., family = "binomial", data = train.reduce)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7261 -0.5498 -0.3276 -0.1633 3.0145
##
## Coefficients: (8 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.633e+00 1.450e+00 1.126 0.259981
## BusinessTravelTravel_Frequently 1.657e+00 4.328e-01 3.829 0.000129
## BusinessTravelTravel_Rarely 8.315e-01 4.039e-01 2.059 0.039535
## DepartmentResearch & Development 3.979e-03 6.060e-01 0.007 0.994761
## DepartmentSales 4.911e-01 6.244e-01 0.787 0.431563
## DistanceFromHome 3.036e-02 1.087e-02 2.792 0.005236
## Education 1.738e-02 9.186e-02 0.189 0.849927
## EducationFieldLife Sciences -4.156e-01 1.051e+00 -0.395 0.692639
## EducationFieldMarketing -3.083e-01 1.096e+00 -0.281 0.778580
## EducationFieldMedical -4.610e-01 1.058e+00 -0.436 0.662936
## EducationFieldOther -8.971e-01 1.116e+00 -0.804 0.421661
## EducationFieldTechnical Degree 3.297e-01 1.065e+00 0.310 0.756888
## EnvironmentSatisfaction -4.162e-01 8.755e-02 -4.754 2.00e-06
## HourlyRate 1.995e-03 4.623e-03 0.432 0.666020
## JobInvolvement -6.546e-01 1.289e-01 -5.078 3.82e-07
## JobSatisfaction -3.717e-01 8.411e-02 -4.419 9.91e-06
## MaritalStatusMarried 5.398e-01 2.858e-01 1.889 0.058957
## MaritalStatusSingle 1.436e+00 2.860e-01 5.019 5.19e-07
## MonthlyRate -9.948e-06 1.305e-05 -0.762 0.445887
## OverTimeYes 1.655e+00 1.955e-01 8.467 < 2e-16
## PerformanceRating -7.501e-02 2.627e-01 -0.286 0.775196
## RelationshipSatisfaction -1.630e-01 8.535e-02 -1.909 0.056247
## TotalWorkingYears -7.401e-02 1.935e-02 -3.825 0.000131
## TrainingTimesLastYear -1.861e-01 7.646e-02 -2.435 0.014907
## YearsAtCompany 5.778e-02 3.219e-02 1.795 0.072654
## YearsWithCurrManager -1.336e-01 4.629e-02 -2.885 0.003908
## `BusinessTravel_Non-Travel` NA NA NA NA
## `Department_Human Resources` NA NA NA NA
## `Department_Research & Development` NA NA NA NA
## Department_Sales NA NA NA NA
## `EducationField_Human Resources` NA NA NA NA
## EducationField_Medical NA NA NA NA
## `EducationField_Life Sciences` NA NA NA NA
## EducationField_Other NA NA NA NA
##
## (Intercept)
## BusinessTravelTravel_Frequently ***
## BusinessTravelTravel_Rarely *
## DepartmentResearch & Development
## DepartmentSales
## DistanceFromHome **
## Education
## EducationFieldLife Sciences
## EducationFieldMarketing
## EducationFieldMedical
## EducationFieldOther
## EducationFieldTechnical Degree
## EnvironmentSatisfaction ***
## HourlyRate
## JobInvolvement ***
## JobSatisfaction ***
## MaritalStatusMarried .
## MaritalStatusSingle ***
## MonthlyRate
## OverTimeYes ***
## PerformanceRating
## RelationshipSatisfaction .
## TotalWorkingYears ***
## TrainingTimesLastYear *
## YearsAtCompany .
## YearsWithCurrManager **
## `BusinessTravel_Non-Travel`
## `Department_Human Resources`
## `Department_Research & Development`
## Department_Sales
## `EducationField_Human Resources`
## EducationField_Medical
## `EducationField_Life Sciences`
## EducationField_Other
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1031.48 on 1169 degrees of freedom
## Residual deviance: 773.81 on 1144 degrees of freedom
## AIC: 825.81
##
## Number of Fisher Scoring iterations: 6
# Remove Non-Sign Variables
# Keep only the train.reduce columns whose coefficients were significant
# in glm.assess above; the indices are column positions in train.reduce.
index <- c(2,4,7,9,10,11,13,15:19)
train.reduce.final <- train.reduce[,index]
# Refit the logistic regression on the reduced predictor set. The response
# is pulled from train.reduce because Attrition is not a column of
# train.reduce.final.
glm.finalversion <- glm(train.reduce$Attrition~.,data = train.reduce.final, family = "binomial")
summary(glm.finalversion)
##
## Call:
## glm(formula = train.reduce$Attrition ~ ., family = "binomial",
## data = train.reduce.final)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7065 -0.5495 -0.3416 -0.1739 3.0886
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.26617 0.70396 1.799 0.072075 .
## BusinessTravelTravel_Frequently 1.62937 0.42961 3.793 0.000149 ***
## BusinessTravelTravel_Rarely 0.83255 0.40206 2.071 0.038387 *
## DistanceFromHome 0.02919 0.01074 2.717 0.006578 **
## EnvironmentSatisfaction -0.41460 0.08516 -4.868 1.12e-06 ***
## JobInvolvement -0.65002 0.12591 -5.162 2.44e-07 ***
## JobSatisfaction -0.37959 0.08266 -4.592 4.39e-06 ***
## MaritalStatusMarried 0.54855 0.28311 1.938 0.052680 .
## MaritalStatusSingle 1.44335 0.28297 5.101 3.38e-07 ***
## OverTimeYes 1.58614 0.19070 8.318 < 2e-16 ***
## RelationshipSatisfaction -0.16695 0.08406 -1.986 0.047030 *
## TotalWorkingYears -0.07556 0.01896 -3.985 6.74e-05 ***
## TrainingTimesLastYear -0.17441 0.07491 -2.328 0.019894 *
## YearsAtCompany 0.05781 0.03161 1.829 0.067389 .
## YearsWithCurrManager -0.12754 0.04492 -2.839 0.004519 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1031.48 on 1169 degrees of freedom
## Residual deviance: 790.81 on 1155 degrees of freedom
## AIC: 820.81
##
## Number of Fisher Scoring iterations: 6
# Drop column 6 of train.reduce.final and refit.
# NOTE(review): the original comment said "Remove Monthly Rate", but
# MonthlyRate is not among the model terms in the summary above, and the
# refit below loses the MaritalStatus terms — column 6 appears to be
# MaritalStatus. Confirm before re-running.
train.reduce.final.version <- train.reduce.final[,-6]
#Reassess Model
glm.finalfinal <- glm(train.reduce$Attrition~.,data = train.reduce.final.version, family = "binomial")
summary(glm.finalfinal)
##
## Call:
## glm(formula = train.reduce$Attrition ~ ., family = "binomial",
## data = train.reduce.final.version)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7758 -0.5742 -0.3691 -0.2011 3.0544
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.10903 0.64902 3.250 0.00116 **
## BusinessTravelTravel_Frequently 1.65976 0.41250 4.024 5.73e-05 ***
## BusinessTravelTravel_Rarely 0.83381 0.38527 2.164 0.03045 *
## DistanceFromHome 0.02510 0.01045 2.402 0.01629 *
## EnvironmentSatisfaction -0.42533 0.08323 -5.110 3.22e-07 ***
## JobInvolvement -0.66063 0.12169 -5.429 5.68e-08 ***
## JobSatisfaction -0.35906 0.08072 -4.448 8.67e-06 ***
## OverTimeYes 1.51959 0.18471 8.227 < 2e-16 ***
## RelationshipSatisfaction -0.15570 0.08204 -1.898 0.05771 .
## TotalWorkingYears -0.07788 0.01879 -4.144 3.41e-05 ***
## TrainingTimesLastYear -0.16452 0.07313 -2.250 0.02447 *
## YearsAtCompany 0.04605 0.03081 1.494 0.13509
## YearsWithCurrManager -0.11105 0.04356 -2.550 0.01079 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1031.48 on 1169 degrees of freedom
## Residual deviance: 826.83 on 1157 degrees of freedom
## AIC: 852.83
##
## Number of Fisher Scoring iterations: 5
# Display the fitted coefficients of the final model (log-odds scale).
coef(glm.finalfinal)
## (Intercept) BusinessTravelTravel_Frequently
## 2.10902537 1.65976133
## BusinessTravelTravel_Rarely DistanceFromHome
## 0.83381424 0.02510475
## EnvironmentSatisfaction JobInvolvement
## -0.42532783 -0.66062915
## JobSatisfaction OverTimeYes
## -0.35905872 1.51959223
## RelationshipSatisfaction TotalWorkingYears
## -0.15569624 -0.07788303
## TrainingTimesLastYear YearsAtCompany
## -0.16451544 0.04604716
## YearsWithCurrManager
## -0.11105360
# Exponentiate the coefficients and their profile-likelihood confidence
# intervals to express the final model as odds ratios.
exp(cbind(coef(glm.finalfinal), confint(glm.finalfinal)))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 8.2402062 2.2938743 29.4491357
## BusinessTravelTravel_Frequently 5.2580558 2.4356727 12.4089926
## BusinessTravelTravel_Rarely 2.3020827 1.1300236 5.1772887
## DistanceFromHome 1.0254225 1.0044848 1.0465530
## EnvironmentSatisfaction 0.6535555 0.5542340 0.7683807
## JobInvolvement 0.5165263 0.4059855 0.6545741
## JobSatisfaction 0.6983333 0.5953772 0.8173264
## OverTimeYes 4.5703612 3.1912666 6.5891055
## RelationshipSatisfaction 0.8558191 0.7284723 1.0052186
## TotalWorkingYears 0.9250726 0.8900473 0.9582470
## TrainingTimesLastYear 0.8483047 0.7334475 0.9772964
## YearsAtCompany 1.0471238 0.9838192 1.1109003
## YearsWithCurrManager 0.8948908 0.8222133 0.9757138
# Evaluate the final reduced model on the hold-out (validation) set.
test <- emp_test
# Predicted attrition probabilities; column 3 (the observed Attrition
# label) is excluded from the newdata passed to predict.
test$final.prob <- predict(glm.finalfinal, newdata = test[, -3], type = "response")
# Classify at the 0.5 probability threshold.
test$final.predicted <- ifelse(test$final.prob > 0.5, "Yes", "No")
# Confusion matrix of predicted vs actual attrition.
Lassofinal <- confusionMatrix(table(test$final.predicted, test$Attrition))
Lassofinal
## Confusion Matrix and Statistics
##
##
## No Yes
## No 246 37
## Yes 5 12
##
## Accuracy : 0.86
## 95% CI : (0.8155, 0.8972)
## No Information Rate : 0.8367
## P-Value [Acc > NIR] : 0.1548
##
## Kappa : 0.3052
## Mcnemar's Test P-Value : 1.724e-06
##
## Sensitivity : 0.9801
## Specificity : 0.2449
## Pos Pred Value : 0.8693
## Neg Pred Value : 0.7059
## Prevalence : 0.8367
## Detection Rate : 0.8200
## Detection Prevalence : 0.9433
## Balanced Accuracy : 0.6125
##
## 'Positive' Class : No
##
# Fourfold display of the LASSO-model confusion matrix.
fourfoldplot(Lassofinal$table)
The following table outlines the four different models that were used, in order of accuracy. As you can tell, logistic regression was the most accurate; however, we recommend using logistic regression with LASSO because it is more efficient.
# Prediction Models
# Review Prediction Models
LogRegOnly # Logistic Regression (all predictors)
## Confusion Matrix and Statistics
##
## true
## pred No Yes
## No 244 32
## Yes 7 17
##
## Accuracy : 0.87
## 95% CI : (0.8266, 0.9059)
## No Information Rate : 0.8367
## P-Value [Acc > NIR] : 0.0658297
##
## Kappa : 0.4015
## Mcnemar's Test P-Value : 0.0001215
##
## Sensitivity : 0.9721
## Specificity : 0.3469
## Pos Pred Value : 0.8841
## Neg Pred Value : 0.7083
## Prevalence : 0.8367
## Detection Rate : 0.8133
## Detection Prevalence : 0.9200
## Balanced Accuracy : 0.6595
##
## 'Positive' Class : No
##
Lassofinal # Logistic Regression with LASSO feature selection
## Confusion Matrix and Statistics
##
##
## No Yes
## No 246 37
## Yes 5 12
##
## Accuracy : 0.86
## 95% CI : (0.8155, 0.8972)
## No Information Rate : 0.8367
## P-Value [Acc > NIR] : 0.1548
##
## Kappa : 0.3052
## Mcnemar's Test P-Value : 1.724e-06
##
## Sensitivity : 0.9801
## Specificity : 0.2449
## Pos Pred Value : 0.8693
## Neg Pred Value : 0.7059
## Prevalence : 0.8367
## Detection Rate : 0.8200
## Detection Prevalence : 0.9433
## Balanced Accuracy : 0.6125
##
## 'Positive' Class : No
##
knnPrediction # k-Nearest Neighbors
## Confusion Matrix and Statistics
##
##
## knn.test No Yes
## No 251 48
## Yes 0 1
##
## Accuracy : 0.84
## 95% CI : (0.7935, 0.8796)
## No Information Rate : 0.8367
## P-Value [Acc > NIR] : 0.4758
##
## Kappa : 0.0337
## Mcnemar's Test P-Value : 1.17e-11
##
## Sensitivity : 1.00000
## Specificity : 0.02041
## Pos Pred Value : 0.83946
## Neg Pred Value : 1.00000
## Prevalence : 0.83667
## Detection Rate : 0.83667
## Detection Prevalence : 0.99667
## Balanced Accuracy : 0.51020
##
## 'Positive' Class : No
##
kWeightedPrediction # Weighted k-Nearest Neighbors
## Confusion Matrix and Statistics
##
##
## prediction No Yes
## No 251 47
## Yes 0 2
##
## Accuracy : 0.8433
## 95% CI : (0.7972, 0.8826)
## No Information Rate : 0.8367
## P-Value [Acc > NIR] : 0.4139
##
## Kappa : 0.0665
## Mcnemar's Test P-Value : 1.949e-11
##
## Sensitivity : 1.00000
## Specificity : 0.04082
## Pos Pred Value : 0.84228
## Neg Pred Value : 1.00000
## Prevalence : 0.83667
## Detection Rate : 0.83667
## Detection Prevalence : 0.99333
## Balanced Accuracy : 0.52041
##
## 'Positive' Class : No
##
# Create Prediction Summary Table
# Flatten each caret confusionMatrix into a one-row data frame of its
# overall and per-class statistics, tagged with the model name.
# (String literals are already character; the original wrapped them in
# redundant as.character() calls.)
dt0 <- data.frame(cbind(t(LogRegOnly$overall),t(LogRegOnly$byClass)))
dt0$Type <- "LogRegOnly"
dt1 <- data.frame(cbind(t(knnPrediction$overall),t(knnPrediction$byClass)))
dt1$Type <- "kNN"
dt3 <- data.frame(cbind(t(kWeightedPrediction$overall),t(kWeightedPrediction$byClass)))
dt3$Type <- "kWeighted"
dt4 <- data.frame(cbind(t(Lassofinal$overall),t(Lassofinal$byClass)))
dt4$Type <- "Lassofinal"
# Stack the four model rows, sort by descending accuracy, and move the
# Type column first (18 statistic columns + Type = 19 columns).
SummaryPred <- rbind(dt0, dt1, dt3, dt4)
SummaryPred <- SummaryPred[order(-SummaryPred$Accuracy),]
SummaryPred <- SummaryPred[,c(19,1:18)]
# Compact view: Type, Accuracy, Sensitivity, Specificity.
SummaryT <- SummaryPred[,c(1,2,9, 10)]
#SummaryPred %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F) %>% scroll_box(width = "100%", height = "200px")
SummaryT %>% kable() %>% kable_styling(bootstrap_options = "striped", full_width = F) %>% scroll_box(width = "100%", height = "200px")
| Type | Accuracy | Sensitivity | Specificity | |
|---|---|---|---|---|
| 1 | LogRegOnly | 0.8700000 | 0.9721116 | 0.3469388 |
| 4 | Lassofinal | 0.8600000 | 0.9800797 | 0.2448980 |
| 3 | kWeighted | 0.8433333 | 1.0000000 | 0.0408163 |
| 2 | kNN | 0.8400000 | 1.0000000 | 0.0204082 |
# --- Alternate (commented-out) kNN prediction pipeline and file paths ---
#dfTrain <- read.csv("E:/Documents/School/MSDS 6306/Case Study 2/CaseStudy2-data.csv", na.strings = "Null")
#dfVal <- read.csv("E:/Documents/School/MSDS 6306/Case Study 2/CaseStudy2Validation.csv", na.strings = "Null")
#Grant Path
#dfTrain <- read.csv("CaseStudy2-data.csv", na.strings = "NULL")
#dfVal <- read.csv("CaseStudy2Validation.csv", na.strings = "NULL")
#dfTrain2 <- fastDummies::dummy_cols(dfTrain) # Create Dummy Variables
#dfVal2 <- fastDummies::dummy_cols(dfVal) # Create Dummy
#Col.KNN <- c(2,5,7,8,10,11,12,14,15,16,18,20,21,22,25,26,27,28,29,30,31,32,33,34,35,36,40,41,42,43,44,45,46,47,48,49,50,51,52,53,54,55,56,57,58,59,60,61,62,63,64,65,66,67,68)
# KNN
# NOTE(review): the commented-out train() call below references
# cols.CatKNN, but the index vector above is named Col.KNN — reconcile
# the names before re-enabling this pipeline.
#set.seed(1)
#fit = train(Attrition~., data=dfTrain2[,c(cols.CatKNN)], method="knn")
#dfVal$Predictions = predict(fit,dfVal2)
#dfPreds = data.frame(dfVal$ID,dfVal$Predictions)
#colnames(dfPreds) = c("ID","Prediction")
#dfPreds
# Write the ID/Prediction pairs to CSV for submission.
write.csv(dfPreds,file = "LabelPrediction.csv",row.names = FALSE)